library(tidyverse)
library(DT)
# Import the state-level baby-name counts shipped with the Lab 9 materials.
names <- here::here(
  "supporting_artifacts",
  "learning_targets",
  "Lab 9",
  "StateNames_A.csv"
) |>
  read_csv()
Challenge 9
Data Import and Packages
# Show an interactive preview of the first rows only. Embedding the full data
# set in a client-side DataTable bloats the rendered page and makes DT warn
# that the data is too big for client-side processing.
datatable(head(names, 100))
Part 1: Summarizing and Visualizing Allisons
Question 1
# Keep a `Sex` column (a copy of `Gender`) for use in the rest of the analysis.
names <- names |>
  mutate(Sex = Gender)

# Total number of babies named "Allison" per state, with one column per sex.
# Filtering before grouping avoids grouping rows that are discarded anyway,
# and `.groups = "drop"` returns an ungrouped table.
allisonname <- names |>
  filter(Name == "Allison") |>
  group_by(State, Sex) |>
  summarize(Count = sum(Count), .groups = "drop") |>
  # A state/sex combination with no recorded Allisons becomes 0 instead of NA.
  pivot_wider(names_from = Sex, values_from = Count, values_fill = 0)
# Render the first three columns (state, female total, male total) as HTML.
allisonname[, 1:3] |>
  knitr::kable(
    format = "html",
    col.names = c("State", "Total Sum of Female", "Total Sum of Male")
  )
| State | Total Sum of Female | Total Sum of Male |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
| IN | 3067 | 0 |
| KS | 1283 | 0 |
| KY | 1905 | 20 |
| LA | 1209 | 0 |
| MA | 2218 | 0 |
| MD | 2229 | 0 |
| ME | 340 | 0 |
| MI | 4014 | 0 |
| MN | 2374 | 0 |
| MO | 2882 | 0 |
| MS | 817 | 0 |
| MT | 226 | 0 |
| NC | 3435 | 0 |
| ND | 285 | 0 |
| NE | 807 | 0 |
| NH | 412 | 0 |
| NJ | 3052 | 0 |
| NM | 399 | 0 |
| NV | 729 | 0 |
| NY | 5747 | 0 |
| OH | 5487 | 0 |
| OK | 1421 | 0 |
| OR | 1186 | 0 |
| PA | 4307 | 0 |
| RI | 306 | 0 |
| SC | 1228 | 0 |
| SD | 376 | 0 |
| TN | 2488 | 0 |
| TX | 10192 | 0 |
| UT | 1125 | 0 |
| VA | 3220 | 0 |
| VT | 135 | 0 |
| WA | 1956 | 0 |
| WI | 2367 | 0 |
| WV | 813 | 0 |
| WY | 142 | 0 |
Question 2
# Every record of a female "Allison", across all states and years.
allisonname_F <- filter(names, Name == "Allison", Sex == "F")
Question 3
# Yearly total of female Allisons, summed over all states.
allisonname_f_byYear <- summarize(
  group_by(allisonname_F, Year),
  Count = sum(Count)
)

# Point-and-line chart of the yearly totals.
allisonname_f_byYear |>
  ggplot(aes(x = Year, y = Count)) +
  geom_point() +
  geom_line() +
  labs(title = 'Popularity of the name "Allison" over time')
Part 2: Modeling the Number of Allisons
Question 4
# Simple linear regression of the yearly Allison count on the year.
Model1 <- lm(Count ~ Year, data = allisonname_f_byYear)
Question 5
# Scatterplot of the observed yearly counts with the least-squares line.
# The plot is built from the data frame the model was fitted to, rather than
# from the `lm` object, so it does not depend on ggplot2 implicitly converting
# a model into a data frame.
allisonname_f_byYear |>
  ggplot(mapping = aes(y = Count, x = Year)) +
  geom_point() +
  stat_smooth(method = "lm")
`geom_smooth()` using formula 'y ~ x'

Question 6
# Print the coefficients of the model fitted in Question 4 instead of
# refitting an identical model.
Model1
Call:
lm(formula = Count ~ Year, data = allisonname_f_byYear)
Coefficients:
(Intercept) Year
209689.8 -101.5
$\widehat{\text{Count}} = 209{,}689.8 - 101.5 \times \text{Year}$, where $\widehat{\text{Count}}$ is the estimated number of Allisons born in a given year.
Question 7
# Residuals-versus-fitted plot, built from the per-observation model output
# returned by broom::augment().
ggplot(broom::augment(Model1), aes(x = .fitted, y = .resid)) +
  geom_point()
In the plot of the residuals against the fitted values, we do not see any discernible pattern.
Question 8
Our model estimates that the number of babies named Allison falls by about 101.5 per year, so the name is declining in popularity. Even so, Allison is still quite popular: about 5,000 newborns were given that name in the most recent year of data.
Part 3: Spelling by State
Question 1
# Yearly totals of male babies for each of the three spellings, drawn as one
# coloured line per spelling.
names |>
  filter(Sex == "M", Name %in% c("Allan", "Alan", "Allen")) |>
  group_by(Year, Name) |>
  # `.groups = "drop"` returns an ungrouped table and avoids the regrouping
  # message summarize() emits when several grouping variables are in play.
  summarize(Count = sum(Count), .groups = "drop") |>
  ggplot(mapping = aes(x = Year, y = Count, color = Name)) +
  geom_point() +
  geom_line() +
  labs(title = 'Popularity of the name "Allen, Allan, Alan" over time')
Question 2
# Counts of male babies with each spelling in California and Pennsylvania in
# 2000, one row per state and one column per spelling.
alan_name_M <- names |>
  filter(
    Sex == "M",
    Name %in% c("Allan", "Alan", "Allen"),
    Year == 2000,
    State %in% c("PA", "CA")
  ) |>
  # Identify rows by State only, so the result stays one row per state even if
  # the data carries other columns that vary between records.
  pivot_wider(id_cols = State, names_from = Name, values_from = Count) |>
  select(State, Alan, Allen, Allan)
alan_name_M
# A tibble: 2 × 4
State Alan Allen Allan
<chr> <dbl> <dbl> <dbl>
1 CA 579 176 131
2 PA 51 56 12
# Render the state-by-spelling counts as an HTML table.
alan_name_M[, 1:4] |>
  knitr::kable(
    format = "html",
    col.names = c(
      "State",
      "Count of Alan",
      "Count of Allen",
      "Count of Allan"
    )
  )
| State | Count of Alan | Count of Allen | Count of Allan |
|---|---|---|---|
| CA | 579 | 176 | 131 |
| PA | 51 | 56 | 12 |
Question 3
# Share of each spelling within each state (CA and PA, males, year 2000).
# Counts are turned into within-state proportions before reshaping to one
# column per spelling; the result stays grouped by State.
alan_name_M_per <- names |>
  filter(
    Sex == "M",
    Year == 2000,
    Name %in% c("Allan", "Alan", "Allen"),
    State %in% c("PA", "CA")
  ) |>
  group_by(State) |>
  mutate(Count = proportions(Count)) |>
  pivot_wider(names_from = Name, values_from = Count) |>
  select(State, Alan, Allen, Allan)
alan_name_M_per
# A tibble: 2 × 4
# Groups: State [2]
State Alan Allen Allan
<chr> <dbl> <dbl> <dbl>
1 CA 0.653 0.199 0.148
2 PA 0.429 0.471 0.101
# Render the within-state shares as a styled HTML table. The values are
# proportions between 0 and 1, so the column headers say "Proportion".
alan_name_M_per[, 1:4] |>
  knitr::kable(
    format = "html",
    col.names = c(
      "State",
      "Proportion by State named Alan",
      "Proportion by State named Allen",
      "Proportion by State named Allan"
    )
  ) |>
  # `latex_options` only affects LaTeX output; for an HTML table the striping
  # is requested through `bootstrap_options`.
  kableExtra::kable_styling(bootstrap_options = "striped", font_size = 13) |>
  kableExtra::row_spec(1:2, color = "white", background = "black")
| State | Proportion by State named Alan | Proportion by State named Allen | Proportion by State named Allan |
|---|---|---|---|
| CA | 0.6534989 | 0.1986456 | 0.1478555 |
| PA | 0.4285714 | 0.4705882 | 0.1008403 |